#import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import warnings
sns.set_style('darkgrid')
pd.set_option("display.max_rows", 500)
warnings.filterwarnings('ignore')
%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 12]
#set path to data and import dataset into DF
fpath = 'C:/Users/evnca/Documents/DSBA/capstone/capstone_project_data/'
fname = 'house_price_data'
fext = '.csv'
# BUG FIX: os.path.join was previously given a single pre-concatenated string,
# which makes the call a no-op; pass the directory and filename separately so
# join actually handles the path separator.
raw_data = pd.read_csv(os.path.join(fpath, fname + fext))
# work on a deep copy so the raw import stays untouched for reference
data = raw_data.copy(deep=True)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21613 entries, 0 to 21612 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 cid 21613 non-null int64 1 dayhours 21613 non-null object 2 price 21613 non-null int64 3 room_bed 21505 non-null float64 4 room_bath 21505 non-null float64 5 living_measure 21596 non-null float64 6 lot_measure 21571 non-null float64 7 ceil 21571 non-null object 8 coast 21612 non-null object 9 sight 21556 non-null float64 10 condition 21556 non-null object 11 quality 21612 non-null float64 12 ceil_measure 21612 non-null float64 13 basement 21612 non-null float64 14 yr_built 21612 non-null object 15 yr_renovated 21613 non-null int64 16 zipcode 21613 non-null int64 17 lat 21613 non-null float64 18 long 21613 non-null object 19 living_measure15 21447 non-null float64 20 lot_measure15 21584 non-null float64 21 furnished 21584 non-null float64 22 total_area 21584 non-null object dtypes: float64(12), int64(4), object(7) memory usage: 3.8+ MB
#examine head of data to verify the import and get a first look at the features
data.head()
| cid | dayhours | price | room_bed | room_bath | living_measure | lot_measure | ceil | coast | sight | ... | basement | yr_built | yr_renovated | zipcode | lat | long | living_measure15 | lot_measure15 | furnished | total_area | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3876100940 | 20150427T000000 | 600000 | 4.0 | 1.75 | 3050.0 | 9440.0 | 1 | 0 | 0.0 | ... | 1250.0 | 1966 | 0 | 98034 | 47.7228 | -122.183 | 2020.0 | 8660.0 | 0.0 | 12490 |
| 1 | 3145600250 | 20150317T000000 | 190000 | 2.0 | 1.00 | 670.0 | 3101.0 | 1 | 0 | 0.0 | ... | 0.0 | 1948 | 0 | 98118 | 47.5546 | -122.274 | 1660.0 | 4100.0 | 0.0 | 3771 |
| 2 | 7129303070 | 20140820T000000 | 735000 | 4.0 | 2.75 | 3040.0 | 2415.0 | 2 | 1 | 4.0 | ... | 0.0 | 1966 | 0 | 98118 | 47.5188 | -122.256 | 2620.0 | 2433.0 | 0.0 | 5455 |
| 3 | 7338220280 | 20141010T000000 | 257000 | 3.0 | 2.50 | 1740.0 | 3721.0 | 2 | 0 | 0.0 | ... | 0.0 | 2009 | 0 | 98002 | 47.3363 | -122.213 | 2030.0 | 3794.0 | 0.0 | 5461 |
| 4 | 7950300670 | 20150218T000000 | 450000 | 2.0 | 1.00 | 1120.0 | 4590.0 | 1 | 0 | 0.0 | ... | 0.0 | 1924 | 0 | 98118 | 47.5663 | -122.285 | 1120.0 | 5100.0 | 0.0 | 5710 |
5 rows × 23 columns
data.tail()
| cid | dayhours | price | room_bed | room_bath | living_measure | lot_measure | ceil | coast | sight | ... | basement | yr_built | yr_renovated | zipcode | lat | long | living_measure15 | lot_measure15 | furnished | total_area | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21608 | 203600600 | 20150310T000000 | 685530 | 4.0 | 2.50 | 3130.0 | 60467.0 | 2 | 0 | 0.0 | ... | 0.0 | 1996 | 0 | 98014 | 47.6618 | -121.962 | 2780.0 | 44224.0 | 1.0 | 63597 |
| 21609 | 625049281 | 20140521T000000 | 535000 | 2.0 | 1.00 | 1030.0 | 4841.0 | 1 | 0 | 0.0 | ... | 110.0 | 1939 | 0 | 98103 | 47.6860 | -122.341 | 1530.0 | 4944.0 | 0.0 | 5871 |
| 21610 | 424069018 | 20140905T000000 | 998000 | 3.0 | 3.75 | 3710.0 | 34412.0 | 2 | 0 | 0.0 | ... | 800.0 | 1978 | 0 | 98075 | 47.5888 | -122.04 | 2390.0 | 34412.0 | 1.0 | 38122 |
| 21611 | 7258200055 | 20150206T000000 | 262000 | 4.0 | 2.50 | 1560.0 | 7800.0 | 2 | 0 | 0.0 | ... | 0.0 | 1997 | 0 | 98168 | 47.5140 | -122.316 | 1160.0 | 7800.0 | 0.0 | 9360 |
| 21612 | 8805900430 | 20141229T000000 | 1150000 | 4.0 | 2.50 | 1940.0 | 4875.0 | 2 | 0 | 0.0 | ... | 0.0 | 1925 | 0 | 98112 | 47.6427 | -122.304 | 1790.0 | 4875.0 | 1.0 | 6815 |
5 rows × 23 columns
#get the total number of observations as (rows, columns)
data.shape
(21613, 23)
#look for missing values in each feature
data.isnull().sum()
cid 0 dayhours 0 price 0 room_bed 108 room_bath 108 living_measure 17 lot_measure 42 ceil 42 coast 1 sight 57 condition 57 quality 1 ceil_measure 1 basement 1 yr_built 1 yr_renovated 0 zipcode 0 lat 0 long 0 living_measure15 166 lot_measure15 29 furnished 29 total_area 29 dtype: int64
#show missing values as a percent of total observations
data.isnull().sum() / data.shape[0] * 100
cid 0.000000 dayhours 0.000000 price 0.000000 room_bed 0.499699 room_bath 0.499699 living_measure 0.078656 lot_measure 0.194327 ceil 0.194327 coast 0.004627 sight 0.263730 condition 0.263730 quality 0.004627 ceil_measure 0.004627 basement 0.004627 yr_built 0.004627 yr_renovated 0.000000 zipcode 0.000000 lat 0.000000 long 0.000000 living_measure15 0.768056 lot_measure15 0.134179 furnished 0.134179 total_area 0.134179 dtype: float64
#check for fully duplicated observations (all 23 columns identical)
data.duplicated().sum()
0
#duplicates in CID, are these different observations for the same property or are they truly duplicated observations?
data['cid'].duplicated().sum()
177
#flag every repeat occurrence of a cid (keep="first" marks all but the first) and list them
dup_series = data['cid'].duplicated(keep="first")
print(data[dup_series]["cid"])
641 8121100395 2138 4154300296 2735 1788900230 2853 9136103130 3581 7227800055 3598 1954420170 3849 2560801222 4191 2787460720 4465 4435000705 4627 1250201165 4682 3332000615 4963 3558900590 5399 7961500010 5823 7899800045 6511 7520000520 6677 3432501415 6936 7977201065 7103 2212200100 7433 1254200015 7473 722039087 7495 1446403850 7546 8910500150 7709 3333002450 7813 1237500540 7967 5127001320 8136 7657000540 8313 8807810890 8342 795000620 8447 9238500040 8638 7853400250 8676 6021500970 9041 1901600090 9151 1036400200 9542 7893805650 9719 1423049019 10023 3578401060 10243 5101405604 10561 5430300171 10562 5083000375 10573 4443800385 10839 7972000010 10992 2726049071 11003 3883800011 11018 9828200460 11144 9407110710 11208 3262300940 11275 1219000473 11399 2019200220 11410 8161020060 11463 1788800630 11487 5101402435 11791 4204400098 11828 7504021310 11949 1922059278 12319 4202400078 12363 2206700215 12420 1450100390 12506 3630120700 12522 641900050 12690 2143700830 12736 8648900110 12759 2619920170 13186 6669020290 13275 4139480200 13326 9834200305 13523 9834200885 13531 2767603612 13688 5054800110 13824 2568300045 13925 2044500213 13982 726049190 14067 2767602141 14125 7230400400 14196 4364700600 14403 5249801440 14441 526059224 14585 1231000510 14724 1000102 14899 3528000040 15059 9211500620 15108 2892700041 15140 5132000140 15193 3271300955 15194 7856400240 15314 6143000020 15427 5332200530 15480 6117501820 15587 8832900780 15695 3904100089 15758 9809000020 15815 9250900104 15929 8129700644 15976 1217000340 16011 4305200070 16105 6632900574 16171 123039336 16351 8820903380 16375 1995200200 16413 9222400605 16454 2724049222 16503 2561340020 16566 3523069060 16629 3293700496 16638 7409700215 16669 7853420110 16675 723049156 16736 3323059027 16828 4345000510 16843 251300110 16884 7983000200 17031 1721801010 17044 7387500235 17179 8564860270 17220 1974300020 17320 2621600015 17341 302000375 17350 7167000040 17411 7520000695 17425 1823049202 17562 1232000810 17637 
8103000110 17654 795000620 17680 5536100020 17683 3739500096 17736 6381500170 17777 1630700361 17787 2023049218 18066 3395040550 18163 643300040 18170 3598600049 18199 1545800290 18239 705730280 18291 9353300600 18470 4222310010 18531 1568100300 18554 1524079093 18745 6021501535 18787 1781500435 19131 4302201085 19160 2473380920 19355 8651510380 19373 937000330 19394 324000530 19606 109200390 19714 6308000010 19720 1432400120 19743 7200179 19748 1523049207 19770 6141100320 19880 5417600130 19903 7129304540 19967 8682262400 19974 8651402750 20111 1825069031 20181 3303000130 20222 8062900070 20332 8645530010 20482 7888000390 20507 2228900270 20528 7701960990 20531 6751300375 20570 4139440480 20721 5282200015 20842 7524400250 20978 8945100320 20981 2231500030 21006 3969300030 21021 6623400187 21148 3185600040 21272 6791200120 21306 7856400300 21351 4139420590 21371 6300000226 21406 1139600270 21430 3935900232 21453 2422049104 21466 4031000520 Name: cid, dtype: int64
#check observations to ensure they're not duplicates and that cid is not a unique identifier
#(same cid, different sale date and price -> looks like the same property resold)
rows = data.loc[data['cid'] == 6300000226]
print(rows)
cid dayhours price room_bed room_bath \
5970 6300000226 20150504T000000 380000 4.0 1.0
21371 6300000226 20140626T000000 240000 4.0 1.0
living_measure lot_measure ceil coast sight ... basement yr_built \
5970 1200.0 2171.0 1.5 0 0.0 ... 0.0 1933
21371 1200.0 2171.0 1.5 0 0.0 ... 0.0 1933
yr_renovated zipcode lat long living_measure15 \
5970 0 98133 47.7076 -122.342 1130.0
21371 0 98133 47.7076 -122.342 1130.0
lot_measure15 furnished total_area
5970 1598.0 0.0 3371
21371 1598.0 0.0 3371
[2 rows x 23 columns]
#check on one more set of observations to confirm the resale pattern
rows2 = data.loc[data['cid'] == 4202400078]
print(rows2)
cid dayhours price room_bed room_bath \
5235 4202400078 20150128T000000 175000 2.0 1.0
12319 4202400078 20150428T000000 335000 2.0 1.0
living_measure lot_measure ceil coast sight ... basement yr_built \
5235 1410.0 7000.0 1 0 0.0 ... 0.0 1968
12319 1410.0 7000.0 1 0 0.0 ... 0.0 1968
yr_renovated zipcode lat long living_measure15 \
5235 0 98055 47.4908 -122.223 1540.0
12319 0 98055 47.4908 -122.223 1540.0
lot_measure15 furnished total_area
5235 6000.0 0.0 8410
12319 6000.0 0.0 8410
[2 rows x 23 columns]
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=True, bins=None):
    """
    Boxplot and histogram combined, sharing one x-axis.

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default True)
    bins: number of bins for histogram (default None, i.e., let seaborn choose)
    """
    # two stacked subplots: a slim boxplot on top of the histogram
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # boxplot; showmeans adds a marker indicating the mean value of the column
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )
    # histogram below; only forward bins when the caller supplied a count,
    # since seaborn picks its own bin edges otherwise (original used an
    # awkward conditional-expression statement to do the same thing)
    if bins is not None:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with the count or percentage printed at the top of each bar.

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # length of the column
    count = data[feature].nunique()
    # scale the figure width with the number of categories shown
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        if perc:  # idiomatic truth test instead of comparing to True
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category

        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the count/percentage just above the bar

    plt.show()  # show the plot
#Let's define a function to plot a nice-looking normalized histogram, including the standard deviation and a kde plot.
def pretty_dist(data, feature, xlimits=None, ylimits=None):
    """Histogram of a feature annotated with its mean, median, and +/- one sigma,
    followed by a printed five-point summary via fivepoint_stats."""
    series = data[feature]
    mu = series.mean()
    sd = series.std()
    #make histogram
    sns.histplot(series, color='teal')
    #plot lines for mean, median, and standard deviation range
    plt.axvline(mu, color='navy', linestyle="--", label=r"$\mu_{\bar{x}}$")
    plt.axvline(series.median(), color='black', linestyle="-.", label="Median")
    plt.axvline(mu + sd, color='red', linestyle=":",
                label=r"$\mu_{\bar{x}} \pm \sigma_{\bar{x}}$")
    plt.axvline(mu - sd, color='red', linestyle=":")
    plt.legend(fontsize=16)
    #axis labels and font sizes
    plt.xlabel(feature, fontsize=14)
    plt.ylabel('Frequency', fontsize=14)
    #adjust tickmark font size
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    #apply optional axis limits
    if xlimits is not None:
        plt.xlim(xlimits)
    if ylimits is not None:
        plt.ylim(ylimits)
    #print the summary statistics alongside the plot
    fivepoint_stats(data, feature)
def fivepoint_stats(data, feature):
    """Print min, mean, median, standard deviation, and max of a numeric column."""
    col = data[feature]
    # table-driven version of the original five print statements; output is identical
    summary = [
        ("Min", col.min()),
        ("Mean", col.mean()),
        ("Median", col.median()),
        ("Sigma", col.std()),
        ("Max", col.max()),
    ]
    for label, value in summary:
        print("%s: %.3f" % (label, value))
#get another look at the feature names for plotting and cleanup dayhours
#(include='all' also surfaces unique/top/freq for the object-typed columns)
data.describe(include='all')
| cid | dayhours | price | room_bed | room_bath | living_measure | lot_measure | ceil | coast | sight | ... | basement | yr_built | yr_renovated | zipcode | lat | long | living_measure15 | lot_measure15 | furnished | total_area | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.161300e+04 | 21613 | 2.161300e+04 | 21505.000000 | 21505.000000 | 21596.000000 | 2.157100e+04 | 21571 | 21612 | 21556.000000 | ... | 21612.000000 | 21612 | 21613.000000 | 21613.000000 | 21613.000000 | 21613 | 21447.000000 | 21584.000000 | 21584.000000 | 21584 |
| unique | NaN | 372 | NaN | NaN | NaN | NaN | NaN | 7 | 3 | NaN | ... | NaN | 117 | NaN | NaN | NaN | 753 | NaN | NaN | NaN | 11145 |
| top | NaN | 20140623T000000 | NaN | NaN | NaN | NaN | NaN | 1 | 0 | NaN | ... | NaN | 2014 | NaN | NaN | NaN | -122.29 | NaN | NaN | NaN | $ |
| freq | NaN | 142 | NaN | NaN | NaN | NaN | NaN | 10647 | 21421 | NaN | ... | NaN | 559 | NaN | NaN | NaN | 116 | NaN | NaN | NaN | 39 |
| mean | 4.580302e+09 | NaN | 5.401822e+05 | 3.371355 | 2.115171 | 2079.860761 | 1.510458e+04 | NaN | NaN | 0.234366 | ... | 291.522534 | NaN | 84.402258 | 98077.939805 | 47.560053 | NaN | 1987.065557 | 12766.543180 | 0.196720 | NaN |
| std | 2.876566e+09 | NaN | 3.673622e+05 | 0.930289 | 0.770248 | 918.496121 | 4.142362e+04 | NaN | NaN | 0.766438 | ... | 442.580840 | NaN | 401.679240 | 53.505026 | 0.138564 | NaN | 685.519629 | 27286.987107 | 0.397528 | NaN |
| min | 1.000102e+06 | NaN | 7.500000e+04 | 0.000000 | 0.000000 | 290.000000 | 5.200000e+02 | NaN | NaN | 0.000000 | ... | 0.000000 | NaN | 0.000000 | 98001.000000 | 47.155900 | NaN | 399.000000 | 651.000000 | 0.000000 | NaN |
| 25% | 2.123049e+09 | NaN | 3.219500e+05 | 3.000000 | 1.750000 | 1429.250000 | 5.040000e+03 | NaN | NaN | 0.000000 | ... | 0.000000 | NaN | 0.000000 | 98033.000000 | 47.471000 | NaN | 1490.000000 | 5100.000000 | 0.000000 | NaN |
| 50% | 3.904930e+09 | NaN | 4.500000e+05 | 3.000000 | 2.250000 | 1910.000000 | 7.618000e+03 | NaN | NaN | 0.000000 | ... | 0.000000 | NaN | 0.000000 | 98065.000000 | 47.571800 | NaN | 1840.000000 | 7620.000000 | 0.000000 | NaN |
| 75% | 7.308900e+09 | NaN | 6.450000e+05 | 4.000000 | 2.500000 | 2550.000000 | 1.068450e+04 | NaN | NaN | 0.000000 | ... | 560.000000 | NaN | 0.000000 | 98118.000000 | 47.678000 | NaN | 2360.000000 | 10087.000000 | 0.000000 | NaN |
| max | 9.900000e+09 | NaN | 7.700000e+06 | 33.000000 | 8.000000 | 13540.000000 | 1.651359e+06 | NaN | NaN | 4.000000 | ... | 4820.000000 | NaN | 2015.000000 | 98199.000000 | 47.777600 | NaN | 6210.000000 | 871200.000000 | 1.000000 | NaN |
11 rows × 23 columns
#extract timestamp and remove from sale date, since timestamp data is not included for the observations
#(every value ends in 'T000000', so only the date portion carries information)
split_datetime = data['dayhours'].str.split('T')
data['dayhours'] = split_datetime.str[0]
print(data['dayhours'])
0 20150427
1 20150317
2 20140820
3 20141010
4 20150218
...
21608 20150310
21609 20140521
21610 20140905
21611 20150206
21612 20141229
Name: dayhours, Length: 21613, dtype: object
#convert dayhours feature to a datetime (format='%Y%m%d' matches the cleaned strings)
data['dayhours'] = pd.to_datetime(data['dayhours'], format='%Y%m%d')
#make sure it worked
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21613 entries, 0 to 21612 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 cid 21613 non-null int64 1 dayhours 21613 non-null datetime64[ns] 2 price 21613 non-null int64 3 room_bed 21505 non-null float64 4 room_bath 21505 non-null float64 5 living_measure 21596 non-null float64 6 lot_measure 21571 non-null float64 7 ceil 21571 non-null object 8 coast 21612 non-null object 9 sight 21556 non-null float64 10 condition 21556 non-null object 11 quality 21612 non-null float64 12 ceil_measure 21612 non-null float64 13 basement 21612 non-null float64 14 yr_built 21612 non-null object 15 yr_renovated 21613 non-null int64 16 zipcode 21613 non-null int64 17 lat 21613 non-null float64 18 long 21613 non-null object 19 living_measure15 21447 non-null float64 20 lot_measure15 21584 non-null float64 21 furnished 21584 non-null float64 22 total_area 21584 non-null object dtypes: datetime64[ns](1), float64(12), int64(4), object(6) memory usage: 3.8+ MB
#let's break down the sale date into year, month, day for ease of visualization.
#we won't necessarily do this for building the model, but it can help us gain insight into the data
#by looking separately at some of the components.
data["sale_year"] = data['dayhours'].dt.year
data["sale_month"] = data['dayhours'].dt.month
data["sale_day"] = data['dayhours'].dt.day
#confirm the three new columns were appended
data.columns
Index(['cid', 'dayhours', 'price', 'room_bed', 'room_bath', 'living_measure',
'lot_measure', 'ceil', 'coast', 'sight', 'condition', 'quality',
'ceil_measure', 'basement', 'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long', 'living_measure15', 'lot_measure15', 'furnished',
'total_area', 'sale_year', 'sale_month', 'sale_day'],
dtype='object')
#visualizing the distribution of sale dates in the data by year
labeled_barplot(data, 'sale_year', perc=True)
Just over two-thirds of our data come from home sales that occurred in 2014, with the remaining third due to home sales in 2015.
#visualizing the distribution of sale dates in the data by month
labeled_barplot(data, 'sale_month', perc=True)
According to our data, home sales peak during the late Spring and early Summer months, with a high point of 11.2% of home sales in our data occurring in May. Home sales then slowly decline throughout Fall and into Winter, with a sharp dip in January of only 4.5% of sales.
#visualizing the distribution of sale dates in the data by day of the month
labeled_barplot(data, 'sale_day', perc=True)
#attempt to create a time series forecast of the sales data.
#(value_counts gives sales per date; note its index is unsorted, so this is only a rough first look)
data['dayhours'].value_counts().plot(figsize = (18, 12))
<AxesSubplot:>
sns.countplot(data["dayhours"].dt.month, hue=data["dayhours"].dt.year, data=data)
<AxesSubplot:xlabel='dayhours', ylabel='count'>
These last two graphs aren't the prettiest, but they help us understand something fundamental about the time frame over which this data was taken: a one-year period beginning May 2014 and running until May 2015. We can dig further into the data to figure out the dates over which the data was collected.
#distribution of the target variable, price
histogram_boxplot(data, 'price')
pretty_dist(data, 'price')
Min: 75000.000 Mean: 540182.159 Median: 450000.000 Sigma: 367362.232 Max: 7700000.000
#zoom in on the bulk of the distribution (price is heavily right-skewed)
pretty_dist(data, 'price', xlimits=(0, 2e6))
Min: 75000.000 Mean: 540182.159 Median: 450000.000 Sigma: 367362.232 Max: 7700000.000
#number of bedrooms
labeled_barplot(data, 'room_bed', perc=True)
fivepoint_stats(data, 'room_bed')
sns.boxplot(data['room_bed'])
Min: 0.000 Mean: 3.371 Median: 3.000 Sigma: 0.930 Max: 33.000
<AxesSubplot:xlabel='room_bed'>
#number of bathrooms (original comment said "bedrooms" but this cell plots room_bath)
labeled_barplot(data, 'room_bath', perc=True)
fivepoint_stats(data, 'room_bath')
sns.boxplot(data['room_bath'])
Min: 0.000 Mean: 2.115 Median: 2.250 Sigma: 0.770 Max: 8.000
<AxesSubplot:xlabel='room_bath'>
#living area measure
histogram_boxplot(data, 'living_measure')
pretty_dist(data, 'living_measure')
Min: 290.000 Mean: 2079.861 Median: 1910.000 Sigma: 918.496 Max: 13540.000
#lot measure, with progressively zoomed-in views since a few very large lots dominate the scale
histogram_boxplot(data, 'lot_measure')
pretty_dist(data, 'lot_measure')
Min: 520.000 Mean: 15104.583 Median: 7618.000 Sigma: 41423.619 Max: 1651359.000
pretty_dist(data, 'lot_measure', xlimits=(0, .25e6))
Min: 520.000 Mean: 15104.583 Median: 7618.000 Sigma: 41423.619 Max: 1651359.000
pretty_dist(data, 'lot_measure', xlimits=(0, 75000))
Min: 520.000 Mean: 15104.583 Median: 7618.000 Sigma: 41423.619 Max: 1651359.000
#categorical/discrete features: ceil (presumably number of floors - confirm with data dictionary),
#coast, sight, and condition
labeled_barplot(data, 'ceil')
labeled_barplot(data, 'ceil', perc=True)
labeled_barplot(data, 'coast', perc=True)
labeled_barplot(data, 'sight', perc=True)
labeled_barplot(data, 'condition', perc=True)
#ceil_measure (presumably the above-basement area - TODO confirm)
histogram_boxplot(data, 'ceil_measure')
pretty_dist(data, 'ceil_measure')
Min: 290.000 Mean: 1788.367 Median: 1560.000 Sigma: 828.103 Max: 9410.000
#basement size; the median of 0 shows most homes have no basement
histogram_boxplot(data,'basement')
pretty_dist(data,'basement')
Min: 0.000 Mean: 291.523 Median: 0.000 Sigma: 442.581 Max: 4820.000
labeled_barplot(data, 'basement', perc=True)
#do not include homes without basements to try to gain better insight on the statistics for basement sizes.
fivepoint_stats(data[data['basement'] > 0], 'basement')
Min: 10.000 Mean: 742.357 Median: 700.000 Sigma: 405.112 Max: 4820.000
pretty_dist(data[data['basement'] > 0], 'basement')
Min: 10.000 Mean: 742.357 Median: 700.000 Sigma: 405.112 Max: 4820.000
#inspecting year built - let's convert this to numerical for easier visualization
#errors='coerce' turns any unparseable year strings into NaT rather than raising
data['yr_built'] = pd.to_datetime(data['yr_built'], format='%Y', errors='coerce')
sns.histplot(data['yr_built'].dt.year)
<AxesSubplot:xlabel='yr_built', ylabel='Count'>
#share of homes built in each year, as a percent
#(dividing by .shape broadcasts against the 1-tuple; .shape[0] would be the clearer spelling)
data["yr_built"].dt.year.value_counts() / data['yr_built'].shape * 100
2014.0 2.586406 2006.0 2.100588 2005.0 2.082080 2004.0 2.003424 2003.0 1.947902 2007.0 1.929394 1977.0 1.929394 1978.0 1.790589 1968.0 1.762828 2008.0 1.698052 1967.0 1.619396 1979.0 1.587008 1959.0 1.545366 1990.0 1.475964 1962.0 1.443576 2001.0 1.411188 1954.0 1.411188 1987.0 1.355666 1989.0 1.337158 1969.0 1.295517 1988.0 1.249248 1955.0 1.244621 1999.0 1.226114 1947.0 1.216860 1963.0 1.184472 1976.0 1.170592 1966.0 1.156711 1950.0 1.156711 1994.0 1.152084 1960.0 1.142831 1980.0 1.110443 1998.0 1.105816 1948.0 1.087309 2009.0 1.064174 1984.0 1.059547 1951.0 1.059547 1985.0 1.054921 1958.0 1.036413 1991.0 1.036413 1961.0 1.036413 1942.0 1.031786 2002.0 1.027160 1953.0 1.027160 1952.0 1.017906 2000.0 1.008652 1986.0 0.994772 1983.0 0.976264 1993.0 0.934623 2013.0 0.929996 1981.0 0.920742 1957.0 0.916115 1956.0 0.916115 1992.0 0.916115 1996.0 0.902235 1949.0 0.902235 1975.0 0.874474 1965.0 0.865220 1926.0 0.832832 1997.0 0.818952 1964.0 0.795817 1943.0 0.786564 2012.0 0.786564 1995.0 0.781937 1925.0 0.763429 1974.0 0.749549 1941.0 0.744922 1940.0 0.721788 1972.0 0.689400 1973.0 0.684773 2010.0 0.661639 1944.0 0.647758 1924.0 0.633878 1910.0 0.619997 1970.0 0.610744 2011.0 0.601490 1928.0 0.582982 1946.0 0.582982 1918.0 0.550595 1927.0 0.532087 1929.0 0.527460 1939.0 0.490446 1982.0 0.485819 1971.0 0.481192 1920.0 0.453431 1922.0 0.439550 1945.0 0.434923 1909.0 0.434923 1906.0 0.425670 1930.0 0.416416 1919.0 0.407162 1900.0 0.402536 1908.0 0.397909 1923.0 0.388655 1912.0 0.365521 1916.0 0.365521 1921.0 0.351640 1905.0 0.342387 1911.0 0.337760 1937.0 0.314625 1907.0 0.300745 1915.0 0.296118 1931.0 0.282238 1913.0 0.272984 1917.0 0.259103 1914.0 0.249850 1938.0 0.240596 1903.0 0.212835 1904.0 0.208208 1936.0 0.185074 1932.0 0.175820 2015.0 0.171193 1933.0 0.138805 1901.0 0.134179 1902.0 0.124925 1935.0 0.111044 1934.0 0.097164 Name: yr_built, dtype: float64
#earliest construction year in the data
data["yr_built"].min()
Timestamp('1900-01-01 00:00:00')
#latest construction year in the data
data["yr_built"].max()
Timestamp('2015-01-01 00:00:00')
#share of homes by renovation year, as a percent
#(0 presumably means never renovated - TODO confirm with data dictionary)
data['yr_renovated'].value_counts() / data['yr_renovated'].shape * 100
0 95.771064 2014 0.421043 2013 0.171193 2003 0.166566 2007 0.161940 2000 0.161940 2005 0.161940 2004 0.120298 1990 0.115671 2006 0.111044 2002 0.101791 1989 0.101791 2009 0.101791 1991 0.092537 2001 0.087910 1993 0.087910 1994 0.087910 1998 0.087910 1983 0.083283 1984 0.083283 1987 0.083283 2008 0.083283 2010 0.083283 1985 0.078656 1986 0.078656 1992 0.078656 1999 0.078656 2015 0.074030 1995 0.074030 1996 0.069403 1988 0.069403 1997 0.069403 2011 0.060149 1980 0.050895 1982 0.050895 2012 0.050895 1979 0.046268 1970 0.041642 1977 0.037015 1968 0.037015 1978 0.027761 1975 0.027761 1958 0.023134 1973 0.023134 1981 0.023134 1965 0.023134 1964 0.023134 1963 0.018507 1972 0.018507 1969 0.018507 1960 0.018507 1945 0.013881 1976 0.013881 1957 0.013881 1974 0.013881 1955 0.013881 1956 0.013881 1953 0.013881 1950 0.009254 1946 0.009254 1962 0.009254 1940 0.009254 1971 0.009254 1967 0.009254 1934 0.004627 1944 0.004627 1948 0.004627 1959 0.004627 1951 0.004627 1954 0.004627 Name: yr_renovated, dtype: float64
labeled_barplot(data, 'yr_renovated', perc=True)
#the '15' features - presumably measures relating to the 15 nearest neighbors; TODO confirm
fivepoint_stats(data, 'living_measure15')
histogram_boxplot(data, 'living_measure15')
Min: 399.000 Mean: 1987.066 Median: 1840.000 Sigma: 685.520 Max: 6210.000
histogram_boxplot(data, 'lot_measure15')
pretty_dist(data, 'lot_measure15', xlimits=(0, 50000))
Min: 651.000 Mean: 12766.543 Median: 7620.000 Sigma: 27286.987 Max: 871200.000
labeled_barplot(data, 'furnished', perc=True)
#convert total area to numerical data so that it can be interpreted
#NOTE(review): astype(..., errors='ignore') returns the column UNCHANGED (still object)
#when any value fails to parse, and is deprecated in newer pandas;
#pd.to_numeric(data['total_area'], errors='coerce') would turn the bad '$' entries into NaN instead
data['total_area'] = data['total_area'].astype(float, errors='ignore')
data['total_area'].describe()
count 21584 unique 11145 top $ freq 39 Name: total_area, dtype: object
#experiencing errors for this feature due to the '$' symbols, will ignore these missing values when trying to visualize
#the data
total_area_cleaned = data['total_area'][data['total_area'] != '$']
sns.histplot(total_area_cleaned)
<AxesSubplot:xlabel='total_area', ylabel='Count'>
#pairwise scatterplots across all numeric features (slow with this many columns)
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x1fd952062e0>
#correlation heatmap of the numeric features
sns.heatmap(data.corr(), vmin=-1.0, vmax=1.0, annot=True)
<AxesSubplot:>
#price against several categorical/discrete features
sns.boxplot(x='sale_year', y='price', data=data)
<AxesSubplot:xlabel='sale_year', ylabel='price'>
sns.boxplot(x='room_bath', y='price', data=data)
<AxesSubplot:xlabel='room_bath', ylabel='price'>
sns.boxplot(x='room_bed', y='price', data=data)
<AxesSubplot:xlabel='room_bed', ylabel='price'>
sns.boxplot(x='furnished', y='price', data=data)
<AxesSubplot:xlabel='furnished', ylabel='price'>
#effect of age of the house at time of sale on the house price
#NOTE(review): the reported minimum of -1 means at least one home has a yr_built
#one year after its sale date - worth investigating upstream
data["sale_age"] = data["dayhours"].dt.year - data["yr_built"].dt.year
histogram_boxplot(data, 'sale_age')
pretty_dist(data, 'sale_age')
Min: -1.000 Mean: 43.314 Median: 40.000 Sigma: 29.376 Max: 115.000
sns.scatterplot(x='sale_age', y='price', data=data)
<AxesSubplot:xlabel='sale_age', ylabel='price'>